In [1]:
!pip install pandas-profiling
In [2]:
import pandas_profiling
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')
In [3]:
train = pd.read_csv('../input/train.csv').drop('Id', axis=1)
test = pd.read_csv('../input/test.csv').drop('Id', axis=1)
print(f'Train: {train.shape}')
print(f'Test: {test.shape}')
Train: (1460, 80)
Test: (1459, 79)
In [4]:
train.profile_report(style={'full_width':True})
Out[4]:

In [5]:
df_corr = train.corr()['SalePrice'].drop('SalePrice')
ax = df_corr.sort_values(ascending=True).plot(kind='barh',
                                              title='Correlation between Features and Sale Price',
                                              figsize=(16,8))
plt.xlabel('Correlation'); plt.ylabel('Features');
In [6]:
X = train.copy()
y = X.pop('SalePrice')

cat = X.describe(include='O').columns
num = X.describe().columns

Models¶

In [7]:
def fit_predict(model, i=10):
    l = []
    for _ in range(i):
        train_X, test_X, train_y, test_y = train_test_split(X.fillna(0)[num],y)
        model.fit(train_X, train_y)
        yhat = model.predict(test_X)
        mae = int(MAE(yhat, test_y))
        l.append(mae)
    mean = int(np.mean(l))
    print('Iter:', *l)
    print(f'Mean: {mean}')
    name = str(lr.__class__)[16:-2]
    pd.DataFrame(l).plot(kind='line', figsize=(10,2), title=str(name))
In [8]:
lr = LinearRegression()
fit_predict(lr)
Iter: 22519 20854 24211 20835 23402 20494 21658 21038 21847 20936
Mean: 21779
In [9]:
rt = RandomForestRegressor(random_state=1)
fit_predict(rt)
Iter: 19612 20070 20290 17772 19589 20548 20236 20179 17803 18848
Mean: 19494
In [10]:
gbr = GradientBoostingRegressor(max_depth=5)
fit_predict(gbr)
Iter: 18108 16525 15687 16818 18031 17041 17590 16206 17642 17685
Mean: 17133
In [13]:
def fillna(df):
    fillna_dict = {
        'Electrical': df['Electrical'].mode()[0],
        'FireplaceQu': 'NA',
        'GarageType': 'NA',
        'GarageYrBlt': df['GarageYrBlt'].mode()[0], #change
        'GarageFinish': 'Unf',
        'GarageCars': 0,
        'GarageArea': 0,
        'GarageQual': 'NA',
        'GarageCond': 'NA',
        'PoolQC': 'NA',
        'Fence': 'NA',
        'MiscFeature': 'NA',
        'BsmtQual': 'NA',
        'BsmtCond': 'NA',
        'BsmtExposure': 'NA',
        'BsmtFinType1': 'NA',
        'BsmtFinType2': 'NA',
        'MasVnrType': 'None',
        'Alley': 'NA',
        'MasVnrArea': 0.0,
        'LotFrontage': df['LotFrontage'].min()
    }
    for k,v in fillna_dict.items():
        df[k].fillna(v, inplace=True)
    return df
In [15]:
def remap_col(remap, to_remap):
    global df
    for col in to_remap:
        df[col] = df[col].replace(remap)


df = train.copy()

to_remap_qal = ['ExterQual', 'KitchenQual', 'BsmtQual', 'HeatingQC', 'BsmtCond', 'PoolQC']
remap_qal = {
    'NA': 0, # NA
    'Po': 1, # poor
    'TA': 2, # typical
    'Fa': 3, # fair
    'Gd': 4, # good
    'Ex': 5, # excellent
}

to_remap_YN = ['CentralAir']
remap_YN = {
    'N': 0,
    'Y': 1
}

remap_col(remap_qal, to_remap_qal)
remap_col(remap_YN, to_remap_YN)

X = df[df.describe().columns]
y = X.pop('SalePrice')
In [16]:
gbr = GradientBoostingRegressor(max_depth=5)
fit_predict(gbr)
Iter: 16278 16707 17053 16089 15998 17199 17444 15541 17675 18831
Mean: 16881
In [ ]:
one_hot_encode = {}

for col in cat:
    try:
        OHE = pd.get_dummies(df[col])
        temp = pd.concat([OHE, y], axis=1)
        temp = abs(temp.corr()['SalePrice']).mean()
        if temp > .2:
            one_hot_encode[col] = temp
    except: pass

one_hot_encode = sorted(one_hot_encode.items(), key=lambda x: x[1], reverse=True)

OHE_cols = [k for k,v in one_hot_encode]
print(OHE_cols)
In [ ]:
df1 = pd.get_dummies(df, columns=OHE_cols)
X = df1.select_dtypes(include=[float, int])
y = X.pop('SalePrice')
In [ ]:
gbr = GradientBoostingRegressor(max_depth=5)
fit_predict(gbr)
In [ ]: